'''
Created on 28/01/2012

@author: christian
http://docs.python.org/library/urlparse.html
http://docs.python.org/tutorial/errors.html
http://docs.python.org/library/urllib2.html
http://docs.python.org/library/urllib2.html#urllib2.Request
http://www.crummy.com/software/BeautifulSoup/documentation.html
http://blog.elcodiguero.com/python/23-eliminar-duplicados-lista.html
http://www.mail-archive.com/dnspython-users@dnspython.org/msg00006.html
http://www.dnspython.org/examples.html
http://docs.python.org/release/2.5.2/lib/typesmapping.html
'''
from urlparse import urlparse
import urllib2
import BeautifulSoup
import dns.resolver

def valida_url(url):
    '''
    Normalize a URL and check that it names a host.

    When url does not already start with "http://" or "https://"
    (compared case-insensitively), "http://" is prepended before the
    check is made.

    @param url: URL to validate, with or without a scheme
    @type url: string
    @return: the URL (prefixed if a scheme had to be added), or None when
    urlparse finds no hostname in it
    '''
    # Both prefixes are tested in one call. The previous test joined two
    # "!=" comparisons with "or", which is always true, so URLs that already
    # carried a scheme were prefixed a second time.
    if not url.lower().startswith(("http://", "https://")):
        url = "http://" + url
    # A URL without a hostname is treated as invalid.
    if urlparse(url).hostname is None:
        return None
    return url

# pylint: disable-msg=W0232
class _MyRedirects(urllib2.HTTPRedirectHandler):
    '''
    Redirect handler that prints a trace line for every HTTP redirect and
    then delegates to the matching urllib2.HTTPRedirectHandler method.
    '''

    @staticmethod
    def _log_redirect(code, handler_name, headers):
        '''
        Print one trace line for a redirect response.

        @param code: HTTP status code received by the handler
        @param handler_name: name of the handler method that was invoked
        @param headers: response headers; only 'Location' is read
        '''
        # .get() rather than [] so that a response without a Location
        # header cannot make the trace line itself raise KeyError before
        # the parent handler has had a chance to process the redirect.
        # A single pre-formatted argument yields the same text as the
        # former comma-separated print statement.
        print("%s  %s  %s" % (code, handler_name, headers.get('Location')))

    # pylint: disable-msg=C0301,R0913,C0111
    def http_error_301(self, req, fpc, code, msg, headers):
        '''Trace a 301 response, then let the base class handle it.'''
        self._log_redirect(code, "http_error_301", headers)
        return urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fpc, code, msg, headers)

    # pylint: disable-msg=C0301,R0913,C0111
    def http_error_302(self, req, fpc, code, msg, headers):
        '''Trace a 302 response, then let the base class handle it.'''
        self._log_redirect(code, "http_error_302", headers)
        return urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fpc, code, msg, headers)

    # pylint: disable-msg=C0301,R0913,C0111
    def http_error_303(self, req, fpc, code, msg, headers):
        '''Trace a 303 response, then let the base class handle it.'''
        self._log_redirect(code, "http_error_303", headers)
        return urllib2.HTTPRedirectHandler.http_error_303(
            self, req, fpc, code, msg, headers)

    # pylint: disable-msg=C0301,R0913,C0111
    def http_error_307(self, req, fpc, code, msg, headers):
        '''Trace a 307 response, then let the base class handle it.'''
        # The label used to read "http_error_304", which did not match the
        # handler that was actually running.
        self._log_redirect(code, "http_error_307", headers)
        return urllib2.HTTPRedirectHandler.http_error_307(
            self, req, fpc, code, msg, headers)

class HtmlExtractor(object):
    '''
    Fetches the HTML served at a given URL.
    '''
    def __init__(self, url):
        '''
        Validate and store the URL to fetch.

        @param url: full URL; relative or absolute paths are not accepted
        @type url: string
        @raise ValueError: when valida_url() returns None for the URL
        '''
        self.follow_redirects = False
        self.html_body = None
        self.url = valida_url(url)
        if self.url is None:
            raise ValueError("URL No valida")


    def get_url(self):
        '''
        Return the URL as stored after validation, for example
        http://www.google.com.pe for the input www.google.com.pe
        '''
        return self.url


    def get_body(self, follow_redirects = False):
        '''
        Request the URL and return the body of the response.

        The body is also kept in self.html_body.

        @param follow_redirects: stored in self.follow_redirects. The
        opener is built with _MyRedirects whatever this value is, so the
        flag currently has no effect on how the request is performed.
        @type follow_redirects: boolean
        @return: the raw response body as returned by read()
        '''
        self.follow_redirects = follow_redirects
        request = urllib2.Request(self.url)
        opener = urllib2.build_opener(_MyRedirects())
        content = opener.open(request)
        # Close the response even if read() fails, so the underlying
        # connection is not left open.
        try:
            self.html_body = content.read()
        finally:
            content.close()
        return self.html_body





class DataExtractor(object):
    '''
    Extracts link data (URLs, host names, IP addresses) from an HTML
    document.

    url -- address the HTML was fetched from. It is stored on the
    instance; none of the methods of this class read it.

    only_href -- flag stored on the instance; none of the methods of this
    class read it.
    '''
    def __init__(self, html_body, url = None, only_href = False):
        '''
        @param html_body: HTML text to analyse
        @param url: address the HTML came from (stored only)
        @param only_href: stored only
        '''
        self.html_body = html_body
        self.url = url
        self.only_href = only_href
        self.urls = []
        self.domains = []
        self.domain_ips = {}


    def get_urls(self):
        '''
        Return the href value of every <a> tag found in the HTML.

        The result is also kept in self.urls, which is rebuilt on each
        call so repeated calls do not pile up duplicate entries.

        @return: list of href values, in the order returned by findAll
        '''
        soup = BeautifulSoup.BeautifulSoup(self.html_body)
        # href=True restricts the search to anchors that carry an href
        # attribute; indexing ["href"] on an anchor without one (such as
        # <a name="top">) would raise KeyError.
        self.urls = [tag["href"] for tag in soup.findAll("a", href=True)]
        return self.urls


    def get_domains(self, urls = None):
        '''
        Return the distinct host names found in a list of URLs.

        Host names are accumulated in self.domains across calls, each one
        stored once, in the order it was first seen. URLs for which
        urlparse reports no hostname are skipped.

        @param urls: URLs to take the host names from; when None, the
        result of get_urls() is used
        @type urls: list
        @return: list with every host name collected so far
        '''
        # The fallback has to be bound to the local name that the loop
        # iterates; previously the loop ran over None and raised TypeError.
        if urls is None:
            urls = self.get_urls()
        known = set(self.domains)
        for url in urls:
            hostname = urlparse(url).hostname
            if hostname is None or hostname in known:
                continue
            known.add(hostname)
            self.domains.append(hostname)
        # Hand back a copy so callers cannot alter the internal list.
        return list(self.domains)


    def get_ips_for_domains(self, domains = None):
        '''
        Resolve the A records of each domain.

        Results are stored in self.domain_ips, keyed by domain. Exceptions
        raised by dns.resolver.query are not caught here.

        @param domains: domains to resolve; when None, the result of
        get_domains() is used
        @type domains: list
        @return: dict mapping each domain to the list of its addresses
        '''
        # Same correction as in get_domains: bind the fallback to the
        # name that is iterated below.
        if domains is None:
            domains = self.get_domains()
        for domain in domains:
            answer = dns.resolver.query(domain, 'A')
            self.domain_ips[domain] = [rdata.address for rdata in answer]
        return self.domain_ips
